Correlation between functional selections on humanDAG1 and mastomysDAG1 cells¶
Compare functional scores measured on cells expressing humanDAG1 and mastomysDAG1
In [1]:
# Imports
import os
import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import altair as alt
# Allow more rows for Altair
_ = alt.data_transformers.disable_max_rows()
File paths for data:
In [2]:
# this cell is tagged as `parameters` for papermill parameterization
HEK293T_data_path = None
humanDAG1_data_path = None
mastomysDAG1_data_path = None
# Minimum times seen filter
MTS = None
n_selections = None
html_dir = None
html_output = None
In [3]:
# Parameters
HEK293T_data_path = "results/func_effects/averages/293T_entry_func_effects.csv"
humanDAG1_data_path = "results/func_effects/averages/human_293T_entry_func_effects.csv"
mastomysDAG1_data_path = (
"results/func_effects/averages/mastomys_293T_entry_func_effects.csv"
)
MTS = 2
n_selections = 8
html_dir = "results/DAG1_ortholog_correlations/"
html_output = "results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html"
In [4]:
# # Uncomment for running interactive
# HEK293T_data_path = "../results/func_effects/averages/293T_entry_func_effects.csv"
# humanDAG1_data_path = "../results/func_effects/averages/human_293T_entry_func_effects.csv"
# mastomysDAG1_data_path = "../results/func_effects/averages/mastomys_293T_entry_func_effects.csv"
# # Minimum times seen filter
# MTS = 2
# n_selections = 8
# html_dir = "../results/DAG1_ortholog_correlations/"
# html_output = "../results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html"
In [5]:
# Read data
hek_df = pd.read_csv(HEK293T_data_path)
human_df = pd.read_csv(humanDAG1_data_path)
mastomys_df = pd.read_csv(mastomysDAG1_data_path)
# Merge data on intersection of measured values
merged_df = (
human_df.merge(
mastomys_df,
how="inner",
on=["site", "wildtype", "mutant"],
suffixes=["_human", "_mastomys"],
validate="one_to_one",
)
.merge(
hek_df,
how="inner",
on=["site", "wildtype", "mutant"],
validate="one_to_one",
)
)
merged_df = merged_df.rename(columns={
"effect" : "effect_HEK293T",
"times_seen" : "times_seen_HEK293T",
"n_selections" : "n_selections_HEK293T",
})
# Add average times seen column
merged_df["average_times_seen"] = merged_df[["times_seen_human", "times_seen_mastomys", "times_seen_HEK293T"]].mean(axis=1)
# Filter for number of selections
merged_df = (
merged_df.loc[
(merged_df["n_selections_human"] == n_selections)
&
(merged_df["n_selections_mastomys"] == n_selections)
&
(merged_df["n_selections_HEK293T"] == n_selections)
]
)
merged_df
Out[5]:
| site | wildtype | mutant | effect_human | times_seen_human | n_selections_human | effect_mastomys | times_seen_mastomys | n_selections_mastomys | effect_HEK293T | times_seen_HEK293T | n_selections_HEK293T | average_times_seen | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | M | * | -4.5400 | 39.620 | 8 | -3.78400 | 40.120 | 8 | -4.3130 | 38.500 | 8 | 39.413333 |
| 1 | 1 | M | A | -3.6680 | 9.500 | 8 | -3.41000 | 9.250 | 8 | -4.2490 | 9.375 | 8 | 9.375000 |
| 2 | 1 | M | C | -3.4210 | 15.120 | 8 | -3.27100 | 15.380 | 8 | -3.8490 | 14.000 | 8 | 14.833333 |
| 3 | 1 | M | D | -3.5640 | 13.000 | 8 | -3.34900 | 13.380 | 8 | -4.0930 | 12.000 | 8 | 12.793333 |
| 4 | 1 | M | E | -3.9870 | 6.125 | 8 | -3.64400 | 5.750 | 8 | -4.1490 | 6.750 | 8 | 6.208333 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 10345 | 491 | R | V | -0.5255 | 7.250 | 8 | -0.73620 | 7.125 | 8 | -0.8019 | 7.125 | 8 | 7.166667 |
| 10346 | 491 | R | W | -1.6390 | 9.000 | 8 | -1.70400 | 8.750 | 8 | -1.9180 | 8.500 | 8 | 8.750000 |
| 10347 | 491 | R | Y | -1.1100 | 4.750 | 8 | 0.01844 | 5.000 | 8 | -1.7270 | 4.750 | 8 | 4.833333 |
| 10348 | 492 | * | * | 0.0000 | NaN | 8 | 0.00000 | NaN | 8 | 0.0000 | NaN | 8 | NaN |
| 10350 | 492 | * | L | 0.2745 | 2.000 | 8 | 0.24450 | 2.000 | 8 | 0.4992 | 1.875 | 8 | 1.958333 |
9820 rows × 13 columns
Plot correlation of scores with an interactive plot
In [6]:
# Calculate statistics
r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_human"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_HEK293T"])
print(f"r correlation human vs HEK (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation human vs HEK (min_times_seen={MTS}): {r**2:.2f}")
r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_mastomys"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_HEK293T"])
print(f"r correlation mastomys vs HEK (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation mastomys vs HEK (min_times_seen={MTS}): {r**2:.2f}")
r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_mastomys"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_human"])
print(f"r correlation mastomys vs human (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation mastomys vs human (min_times_seen={MTS}): {r**2:.2f}")
slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)
# Plot data
human_vs_hek = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
alt.X(
"effect_human",
axis=alt.Axis(
title=["functional score measured in", "293T\u0394DAG1+humanDAG1 cells"],
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
alt.Y(
"effect_HEK293T",
axis=alt.Axis(
title="functional score measured in 293T cells",
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
opacity=alt.condition(
alt.datum.average_times_seen < selector,
alt.value(0),
alt.value(0.15)
),
tooltip=[
"site",
"wildtype",
"mutant",
"effect_human",
"times_seen_human",
"n_selections_human",
"effect_HEK293T",
"times_seen_HEK293T",
"n_selections_HEK293T"
],
).properties(
width=300,
height=300
)
mastomys_vs_hek = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
alt.X(
"effect_mastomys",
axis=alt.Axis(
title=["functional score measured in", "293T\u0394DAG1+mastomysDAG1 cells"],
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
alt.Y(
"effect_HEK293T",
axis=alt.Axis(
title="functional score measured in 293T cells",
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
opacity=alt.condition(
alt.datum.average_times_seen < selector,
alt.value(0),
alt.value(0.15)
),
tooltip=[
"site",
"wildtype",
"mutant",
"effect_mastomys",
"times_seen_mastomys",
"n_selections_mastomys",
"effect_HEK293T",
"times_seen_HEK293T",
"n_selections_HEK293T",
],
).properties(
width=300,
height=300
)
mastomys_vs_human = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
alt.X(
"effect_human",
axis=alt.Axis(
title=["functional score measured in", "293T\u0394DAG1+humanDAG1 cells"],
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
alt.Y(
"effect_mastomys",
axis=alt.Axis(
title=["functional score measured in", "293T\u0394DAG1+mastomysDAG1 cells"],
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
opacity=alt.condition(
alt.datum.average_times_seen < selector,
alt.value(0),
alt.value(0.15)
),
tooltip=[
"site",
"wildtype",
"mutant",
"effect_human",
"times_seen_human",
"n_selections_human",
"effect_mastomys",
"times_seen_mastomys",
"n_selections_mastomys"
],
).properties(
width=300,
height=300
)
corr_chart = alt.hconcat(
human_vs_hek,
mastomys_vs_hek,
mastomys_vs_human,
spacing=5
).add_params(
selector
).configure_axis(
grid=False,
labelFontSize=16,
titleFontSize=16,
)
# Make output dir if doesn't exist
if not os.path.exists(html_dir):
os.mkdir(html_dir)
print(f"Saving to {html_output}")
corr_chart.save(html_output)
corr_chart
r correlation human vs HEK (min_times_seen=2): 0.93 r^2 correlation human vs HEK (min_times_seen=2): 0.87 r correlation mastomys vs HEK (min_times_seen=2): 0.93 r^2 correlation mastomys vs HEK (min_times_seen=2): 0.87 r correlation mastomys vs human (min_times_seen=2): 0.94 r^2 correlation mastomys vs human (min_times_seen=2): 0.89 Saving to results/DAG1_ortholog_correlations/DAG1_ortholog_correlations.html
Out[6]:
Recreate same plot as above but reduce font sizes for a figure in a manuscript
In [7]:
# Calculate statistics
r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_human"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_HEK293T"])
print(f"r correlation human vs HEK (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation human vs HEK (min_times_seen={MTS}): {r**2:.2f}")
r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_mastomys"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_HEK293T"])
print(f"r correlation mastomys vs HEK (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation mastomys vs HEK (min_times_seen={MTS}): {r**2:.2f}")
r, p = sp.stats.pearsonr(merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_mastomys"], merged_df.loc[merged_df["average_times_seen"] >= MTS]["effect_human"])
print(f"r correlation mastomys vs human (min_times_seen={MTS}): {r:.2f}")
print(f"r^2 correlation mastomys vs human (min_times_seen={MTS}): {r**2:.2f}")
slider = alt.binding_range(min=1, max=25, step=1, name="times_seen")
selector = alt.param(name="SelectorName", value=MTS, bind=slider)
# Plot data
human_vs_hek = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
alt.X(
"effect_human",
axis=alt.Axis(
title=["functional score measured in", "293T\u0394DAG1+humanDAG1 cells"],
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
alt.Y(
"effect_HEK293T",
axis=alt.Axis(
title="functional score measured in 293T cells",
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
opacity=alt.condition(
alt.datum.average_times_seen < selector,
alt.value(0),
alt.value(0.15)
),
tooltip=[
"site",
"wildtype",
"mutant",
"effect_human",
"times_seen_human",
"n_selections_human",
"effect_HEK293T",
"times_seen_HEK293T",
"n_selections_HEK293T"
],
).properties(
width=115,
height=115
)
mastomys_vs_hek = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
alt.X(
"effect_mastomys",
axis=alt.Axis(
title=["functional score measured in", "293T\u0394DAG1+mastomysDAG1 cells"],
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
alt.Y(
"effect_HEK293T",
axis=alt.Axis(
title="functional score measured in 293T cells",
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
opacity=alt.condition(
alt.datum.average_times_seen < selector,
alt.value(0),
alt.value(0.15)
),
tooltip=[
"site",
"wildtype",
"mutant",
"effect_mastomys",
"times_seen_mastomys",
"n_selections_mastomys",
"effect_HEK293T",
"times_seen_HEK293T",
"n_selections_HEK293T",
],
).properties(
width=115,
height=115
)
mastomys_vs_human = alt.Chart(merged_df).mark_point(filled=True, color="black").encode(
alt.X(
"effect_human",
axis=alt.Axis(
title=["functional score measured in", "293T\u0394DAG1+humanDAG1 cells"],
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
alt.Y(
"effect_mastomys",
axis=alt.Axis(
title=["functional score measured in", "293T\u0394DAG1+mastomysDAG1 cells"],
values=[-6, -4, -2, 0, 2],
domainWidth=1,
domainColor="black",
tickColor="black",
),
scale=alt.Scale(domain=[-6,2])
),
opacity=alt.condition(
alt.datum.average_times_seen < selector,
alt.value(0),
alt.value(0.15)
),
tooltip=[
"site",
"wildtype",
"mutant",
"effect_human",
"times_seen_human",
"n_selections_human",
"effect_mastomys",
"times_seen_mastomys",
"n_selections_mastomys"
],
).properties(
width=115,
height=115
)
corr_chart = alt.hconcat(
human_vs_hek,
mastomys_vs_hek,
mastomys_vs_human,
spacing=5
).add_params(
selector
).configure_axis(
grid=False,
labelFontSize=8,
titleFontSize=8,
).configure_point(
size=10
)
corr_chart
r correlation human vs HEK (min_times_seen=2): 0.93 r^2 correlation human vs HEK (min_times_seen=2): 0.87 r correlation mastomys vs HEK (min_times_seen=2): 0.93 r^2 correlation mastomys vs HEK (min_times_seen=2): 0.87 r correlation mastomys vs human (min_times_seen=2): 0.94 r^2 correlation mastomys vs human (min_times_seen=2): 0.89
Out[7]: